employee <- read_dta("EmployeeStatus.dta")
employee_char <- read_dta("EmployeeCharacteristics.dta")
quits <- read_dta("Quits.dta")
quits_d <- read_dta("QuitDate.dta")
attitude <- read_dta("Attitudes.dta")
performance_p <- read_dta("Performance_Panel.dta")
performance <- read_dta("Performance.dta")
# create a list for all 7 datasets
data_list <- list(
employee = employee,
employee_char = employee_char,
quits = quits,
quits_d = quits_d,
attitude = attitude,
performance_p = performance_p,
performance = performance
)
# loop through and show summary of all 7 datasets
for (dataset_name in names(data_list)) {
cat(dataset_name, ":\n")
print(summary(data_list[[dataset_name]]))
cat("\n")
}
## employee :
## personid treatment
## Min. : 3906 Min. :0.0000
## 1st Qu.:19470 1st Qu.:0.0000
## Median :31936 Median :1.0000
## Mean :29620 Mean :0.5261
## 3rd Qu.:39748 3rd Qu.:1.0000
## Max. :45442 Max. :1.0000
##
## employee_char :
## personid prior_experience age tenure
## Min. : 3906 Min. :-99.00 Min. :-99.0 Min. :-99.00
## 1st Qu.:19470 1st Qu.: 0.00 1st Qu.: 22.0 1st Qu.: 9.00
## Median :31936 Median : 6.00 Median : 24.0 Median : 23.00
## Mean :29620 Mean : 16.93 Mean : 23.4 Mean : 26.22
## 3rd Qu.:39748 3rd Qu.: 26.00 3rd Qu.: 26.0 3rd Qu.: 44.00
## Max. :45442 Max. :180.00 Max. : 35.0 Max. : 96.00
## basewage bonus grosswage costofcommute
## Min. :1156 Min. : 123.2 Min. :1388 Min. : 0.000
## 1st Qu.:1450 1st Qu.: 620.7 1st Qu.:2403 1st Qu.: 4.000
## Median :1550 Median : 909.3 Median :2786 Median : 8.000
## Mean :1551 Mean :1060.1 Mean :2975 Mean : 8.103
## 3rd Qu.:1617 3rd Qu.:1294.0 3rd Qu.:3485 3rd Qu.:10.000
## Max. :2567 Max. :3965.9 Max. :6221 Max. :55.000
## rental male married high_school
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.2249 Mean :0.4659 Mean :0.2691 Mean :0.8434
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## quits :
## personid quitjob
## Min. : 3906 Min. :0.000
## 1st Qu.:19470 1st Qu.:0.000
## Median :31936 Median :0.000
## Mean :29620 Mean :0.249
## 3rd Qu.:39748 3rd Qu.:0.000
## Max. :45442 Max. :1.000
##
## quits_d :
## personid year month stillworking
## Min. : 3906 Min. :2010 Min. : 1.000 Min. :0.0000
## 1st Qu.:19470 1st Qu.:2011 1st Qu.: 3.000 1st Qu.:1.0000
## Median :31936 Median :2011 Median : 5.000 Median :1.0000
## Mean :29620 Mean :2011 Mean : 5.333 Mean :0.8831
## 3rd Qu.:39748 3rd Qu.:2011 3rd Qu.: 7.000 3rd Qu.:1.0000
## Max. :45442 Max. :2011 Max. :12.000 Max. :1.0000
## quit
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.02767
## 3rd Qu.:0.00000
## Max. :1.00000
##
## attitude :
## personid surveyno post satisfaction general
## Min. : 3906 Min. :1 Min. :0.0 Min. :1.000 Min. : 32.00
## 1st Qu.:18112 1st Qu.:2 1st Qu.:1.0 1st Qu.:4.000 1st Qu.: 64.50
## Median :31136 Median :3 Median :1.0 Median :5.000 Median : 73.00
## Mean :29094 Mean :3 Mean :0.8 Mean :4.696 Mean : 72.78
## 3rd Qu.:39512 3rd Qu.:4 3rd Qu.:1.0 3rd Qu.:6.000 3rd Qu.: 80.00
## Max. :45442 Max. :5 Max. :1.0 Max. :7.000 Max. :100.00
## life
## Min. : 2.00
## 1st Qu.:16.00
## Median :22.00
## Mean :21.46
## 3rd Qu.:27.00
## Max. :38.00
##
## performance_p :
## personid year month post
## Min. : 3906 Min. :2010 Min. : 1.000 Min. :0.000
## 1st Qu.:19470 1st Qu.:2010 1st Qu.: 3.000 1st Qu.:0.000
## Median :31292 Median :2010 Median : 6.000 Median :0.000
## Mean :29369 Mean :2010 Mean : 5.777 Mean :0.425
## 3rd Qu.:39512 3rd Qu.:2011 3rd Qu.: 8.000 3rd Qu.:1.000
## Max. :45442 Max. :2011 Max. :12.000 Max. :1.000
##
## performance_score total_monthly_calls calls_per_hour
## Min. : 49.69 Min. :-999999 Min. : 12.21
## 1st Qu.: 73.42 1st Qu.: 1346 1st Qu.: 18.65
## Median : 79.37 Median : 1852 Median : 20.52
## Mean : 79.80 Mean : 1639 Mean : 20.58
## 3rd Qu.: 84.72 3rd Qu.: 2396 3rd Qu.: 22.31
## Max. :1000.00 Max. : 5035 Max. :200.00
## NA's :139 NA's :141
##
## performance :
## personid post performance_score total_monthly_calls
## Min. : 3906 Min. :0.0 Min. : 49.69 Min. :-109647
## 1st Qu.:19470 1st Qu.:0.0 1st Qu.: 73.60 1st Qu.: 1398
## Median :31936 Median :0.5 Median : 78.74 Median : 1815
## Mean :29620 Mean :0.5 Mean : 79.08 Mean : 1615
## 3rd Qu.:39748 3rd Qu.:1.0 3rd Qu.: 82.97 3rd Qu.: 2196
## Max. :45442 Max. :1.0 Max. :283.19 Max. : 4367
## calls_per_hour
## Min. :13.88
## 1st Qu.:18.92
## Median :20.53
## Mean :20.60
## 3rd Qu.:22.13
## Max. :39.26
plot_scatter_first_column <- function(data, dataset_name) {
numeric_vars <- sapply(data, is.numeric)
numeric_names <- names(data)[numeric_vars]
if (length(numeric_names) > 1) {
x_axis <- numeric_names[1] # use personid as the x-axis
for (y_axis in numeric_names[-1]) {
plot <- ggplot(data, aes_string(x = x_axis, y = y_axis)) +
geom_point(alpha = 0.5) +
labs(title = paste("Scatter Plot of", y_axis, "vs", x_axis, "in", dataset_name),
x = x_axis, y = y_axis) +
theme_minimal()
print(plot)
}
} else {
cat("No enough numeric variables.")
}
}
# loop through each dataset and generate plots
for (dataset_name in names(data_list)) {
cat("Generating plots for", dataset_name, "\n")
plot_scatter_first_column(data_list[[dataset_name]], dataset_name)
}
## Generating plots for employee
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Generating plots for employee_char











## Generating plots for quits

## Generating plots for quits_d




## Generating plots for attitude





## Generating plots for performance_p



## Warning: Removed 139 rows containing missing values (`geom_point()`).


## Warning: Removed 141 rows containing missing values (`geom_point()`).

## Generating plots for performance




# dataset2. employee_char
data_list[[2]] <- data_list[[2]] %>%
mutate(
prior_experience = if_else(prior_experience < 0, NA, prior_experience),
age = if_else(age < 0, NA, age),
tenure = if_else(tenure < 0, NA, tenure)
)
summary(data_list[[2]]) # double check the summary
## personid prior_experience age tenure
## Min. : 3906 Min. : 0.00 Min. :18.00 Min. : 2.00
## 1st Qu.:19470 1st Qu.: 0.00 1st Qu.:22.00 1st Qu.: 9.00
## Median :31936 Median : 6.00 Median :24.00 Median :23.00
## Mean :29620 Mean : 17.87 Mean :24.39 Mean :27.23
## 3rd Qu.:39748 3rd Qu.: 28.00 3rd Qu.:26.50 3rd Qu.:44.00
## Max. :45442 Max. :180.00 Max. :35.00 Max. :96.00
## NA's :2 NA's :2 NA's :2
## basewage bonus grosswage costofcommute
## Min. :1156 Min. : 123.2 Min. :1388 Min. : 0.000
## 1st Qu.:1450 1st Qu.: 620.7 1st Qu.:2403 1st Qu.: 4.000
## Median :1550 Median : 909.3 Median :2786 Median : 8.000
## Mean :1551 Mean :1060.1 Mean :2975 Mean : 8.103
## 3rd Qu.:1617 3rd Qu.:1294.0 3rd Qu.:3485 3rd Qu.:10.000
## Max. :2567 Max. :3965.9 Max. :6221 Max. :55.000
##
## rental male married high_school
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.2249 Mean :0.4659 Mean :0.2691 Mean :0.8434
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
# dataset6. performance_p
data_list[[6]] <- data_list[[6]] %>%
mutate(
performance_score = ifelse(performance_score > 100, NA, performance_score),
total_monthly_calls = ifelse(total_monthly_calls < 0, NA, total_monthly_calls),
calls_per_hour = ifelse(calls_per_hour > 100, NA, calls_per_hour)
)
summary(data_list[[6]])
## personid year month post
## Min. : 3906 Min. :2010 Min. : 1.000 Min. :0.000
## 1st Qu.:19470 1st Qu.:2010 1st Qu.: 3.000 1st Qu.:0.000
## Median :31292 Median :2010 Median : 6.000 Median :0.000
## Mean :29369 Mean :2010 Mean : 5.777 Mean :0.425
## 3rd Qu.:39512 3rd Qu.:2011 3rd Qu.: 8.000 3rd Qu.:1.000
## Max. :45442 Max. :2011 Max. :12.000 Max. :1.000
##
## performance_score total_monthly_calls calls_per_hour
## Min. : 49.69 Min. : 0 Min. :12.21
## 1st Qu.: 73.41 1st Qu.:1346 1st Qu.:18.65
## Median : 79.37 Median :1852 Median :20.52
## Mean : 79.17 Mean :1860 Mean :20.54
## 3rd Qu.: 84.70 3rd Qu.:2396 3rd Qu.:22.31
## Max. :100.00 Max. :5035 Max. :28.80
## NA's :142 NA's :1 NA's :142
# dataset7. performance
data_list[[7]] <- data_list[[7]] %>%
mutate(
performance_score = if_else(performance_score > 100, NA, performance_score),
total_monthly_calls = if_else(total_monthly_calls < 0, NA, total_monthly_calls),
calls_per_hour = if_else(calls_per_hour > 35, NA, calls_per_hour)
)
summary(data_list[[7]])
## personid post performance_score total_monthly_calls
## Min. : 3906 Min. :0.0 Min. : 49.69 Min. : 228.8
## 1st Qu.:19470 1st Qu.:0.0 1st Qu.: 73.55 1st Qu.:1397.8
## Median :31936 Median :0.5 Median : 78.72 Median :1816.0
## Mean :29620 Mean :0.5 Mean : 78.47 Mean :1838.6
## 3rd Qu.:39748 3rd Qu.:1.0 3rd Qu.: 82.93 3rd Qu.:2196.6
## Max. :45442 Max. :1.0 Max. :100.00 Max. :4367.1
## NA's :2 NA's :1
## calls_per_hour
## Min. :13.88
## 1st Qu.:18.92
## Median :20.52
## Mean :20.57
## 3rd Qu.:22.13
## Max. :28.80
## NA's :1
# merge two datasets, employee and attitude
satis_lev <- inner_join(data_list[[1]], data_list[[5]], by = "personid")
sorted_satis_level <- arrange(satis_lev, personid)
head(sorted_satis_level) # preview the data
## # A tibble: 6 × 7
## personid treatment surveyno post satisfaction general life
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3906 1 1 0 7 78 18
## 2 3906 1 2 1 3 72 17
## 3 3906 1 3 1 5 65 15
## 4 3906 1 4 1 2 72 19
## 5 3906 1 5 1 6 73 20
## 6 4122 1 1 0 4 72 10
# create dummy variables and interaction terms
sorted_satis_level1 <- sorted_satis_level %>%
mutate(time1 = as.numeric(surveyno == 1),
time2 = as.numeric(surveyno == 2),
time3 = as.numeric(surveyno == 3),
time4 = as.numeric(surveyno == 4),
time5 = as.numeric(surveyno == 5)) %>%
mutate(time1_treat = time1 * treatment,
time2_treat = time2 * treatment,
time3_treat = time3 * treatment,
time4_treat = time4 * treatment,
time5_treat = time5 * treatment)
# run the OLS regression
model1 <- lm(satisfaction ~ time1_treat + time2_treat + time3_treat + time4_treat + time5_treat, data = sorted_satis_level1)
summary(model1)
##
## Call:
## lm(formula = satisfaction ~ time1_treat + time2_treat + time3_treat +
## time4_treat + time5_treat, data = sorted_satis_level1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0841 -0.6262 0.1121 0.9159 2.6125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.38750 0.07437 58.999 < 2e-16 ***
## time1_treat 0.23867 0.14856 1.607 0.108520
## time2_treat 0.69661 0.14856 4.689 3.19e-06 ***
## time3_treat 0.47231 0.14856 3.179 0.001530 **
## time4_treat 0.50035 0.14856 3.368 0.000791 ***
## time5_treat 0.55643 0.14856 3.746 0.000192 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.33 on 849 degrees of freedom
## Multiple R-squared: 0.0387, Adjusted R-squared: 0.03304
## F-statistic: 6.835 on 5 and 849 DF, p-value: 2.906e-06
# merge three datasets, employee, employee_char and performance
performance <- data_list[[1]] %>%
left_join(data_list[[2]], by = "personid") %>%
left_join(data_list[[7]], by = "personid") %>%
arrange(performance)
head(performance) # preview the data
## # A tibble: 6 × 17
## personid treatment prior_experience age tenure basewage bonus grosswage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 33278 0 0 20 22 1450 594. 2275.
## 2 33278 0 0 20 22 1450 594. 2275.
## 3 44800 0 5 25 2 1500 588. 2742.
## 4 44800 0 5 25 2 1500 588. 2742.
## 5 32320 0 0 22 23 1600 2272 4578.
## 6 32320 0 0 22 23 1600 2272 4578.
## # ℹ 9 more variables: costofcommute <dbl>, rental <dbl+lbl>, male <dbl>,
## # married <dbl>, high_school <dbl>, post <dbl>, performance_score <dbl>,
## # total_monthly_calls <dbl>, calls_per_hour <dbl>
# create an interaction variable
performance$post_treatment <- performance$post * performance$treatment
# run OLS regression
model2 <- lm(performance_score ~ prior_experience + post + age + tenure + bonus + post_treatment, data = performance)
summary(model2)
##
## Call:
## lm(formula = performance_score ~ prior_experience + post + age +
## tenure + bonus + post_treatment, data = performance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.9691 -3.8610 0.6565 4.0983 17.2268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.0460028 2.4961700 28.863 < 2e-16 ***
## prior_experience -0.0215897 0.0131627 -1.640 0.102
## post -3.4108912 0.6524694 -5.228 2.56e-07 ***
## age 0.1198674 0.1187499 1.009 0.313
## tenure -0.0272765 0.0173378 -1.573 0.116
## bonus 0.0051449 0.0005008 10.274 < 2e-16 ***
## post_treatment 3.6218727 0.7444218 4.865 1.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.811 on 485 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.2513, Adjusted R-squared: 0.242
## F-statistic: 27.13 on 6 and 485 DF, p-value: < 2.2e-16
# merge three datasets, employee, employee_char and quits
retention <- data_list[[1]] %>%
left_join(data_list[[2]], by = "personid") %>%
left_join(data_list[[3]], by = "personid")
# Sort by personid in ascending order
sorted_retention <- retention %>%
arrange(personid)
head(sorted_retention) # preview the data
## # A tibble: 6 × 14
## personid treatment prior_experience age tenure basewage bonus grosswage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3906 1 84 33 96 1883. 713. 2738.
## 2 4122 1 0 30 94 1700 2097. 4096.
## 3 4448 0 0 35 92 1750 2153. 4269.
## 4 4942 1 0 27 82 1900 3043 5295.
## 5 5018 1 0 29 82 2139. 464. 1388.
## 6 5614 1 0 32 79 1700 1274. 3826.
## # ℹ 6 more variables: costofcommute <dbl>, rental <dbl+lbl>, male <dbl>,
## # married <dbl>, high_school <dbl>, quitjob <dbl>
# conduct a logistic regression
model3 <- glm(quitjob ~ treatment + prior_experience + age + tenure + basewage + bonus + grosswage + costofcommute + rental + male + married + high_school, data = sorted_retention, family = binomial)
summary(model3)
##
## Call:
## glm(formula = quitjob ~ treatment + prior_experience + age +
## tenure + basewage + bonus + grosswage + costofcommute + rental +
## male + married + high_school, family = binomial, data = sorted_retention)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7319252 2.5681927 0.285 0.775647
## treatment -1.1453022 0.3384103 -3.384 0.000713 ***
## prior_experience 0.0110571 0.0088475 1.250 0.211395
## age -0.0939244 0.0894540 -1.050 0.293730
## tenure -0.0167165 0.0147146 -1.136 0.255934
## basewage 0.0020688 0.0017527 1.180 0.237862
## bonus 0.0009131 0.0009373 0.974 0.329940
## grosswage -0.0015550 0.0007494 -2.075 0.037974 *
## costofcommute 0.0388097 0.0234825 1.653 0.098391 .
## rental 0.6882141 0.4272095 1.611 0.107190
## male 0.3721771 0.3657949 1.017 0.308941
## married 0.2564367 0.4604334 0.557 0.577564
## high_school 0.8911633 0.5440506 1.638 0.101418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 278.34 on 246 degrees of freedom
## Residual deviance: 234.27 on 234 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 260.27
##
## Number of Fisher Scoring iterations: 5